joseph.ergo@proton.me | Portfolio | Resume PDF | Linked-In | +212 713-617-633

Available immediately for full/part-time remote roles

Show the code
## SETUP
from pathlib import Path
import duckdb
from tqdm.notebook import tqdm
import datetime
import copy
import polars as pl
import plotly.express as px
import plotly.io as pio
import re
from concurrent.futures import ThreadPoolExecutor
import plotly.graph_objects as go
import networkx as nx
import numpy as np
# pio.renderers.default = 'plotly_mimetype'
pio.renderers.default = 'jupyterlab+notebook'
pio.templates.default = "plotly_white"

path_data = Path.cwd()/'data'/'03_rdb'
path_data_companies = path_data/'companies_table.parquet'
path_data_experience = path_data/'experience_table.parquet'
path_data_emails = path_data/'emails_table.parquet'
path_data_education = path_data/'education_table.parquet'
path_data_school = path_data/'school_table.parquet'
path_data_persona = path_data/'persona_table.parquet'
path_data_profiles = path_data/'profiles_table.parquet'

path_output_images = Path.cwd()/'output'/'images'

conn = duckdb.connect()

conn.execute("SET temp_directory = 'temp';")
conn.execute("SET memory_limit = '10GB';")
conn.execute("SET max_temp_directory_size = '100GB';")
conn.execute("SET threads = 8;")
conn.execute("SET preserve_insertion_order = false;")
conn.execute("SET enable_progress_bar = true;")
conn.execute("SET enable_progress_bar_print = true;")
Show the code
df = pl.read_parquet('03_target_companies3.parquet')
df_yearly_new_hires_per_indestry = pl.read_parquet('03_yearly_new_hires_per_indestry.parquet')
Show the code
current_company_id = "&-friends"
current_company_id = pl.read_json("04__control__.json")[0,'current_company_id']
query = f"""
SELECT *
FROM read_parquet('{path_data_companies}')
WHERE company_id = '{current_company_id}'
"""
df_company_by_company_id = pl.DataFrame(conn.execute(query).df())

current_company_name = df_company_by_company_id[0,'company_name']
current_company_indestry = df_company_by_company_id[0,'company_industry']

current_company_parquet = Path.cwd()/'output'/'company_data'/f"{current_company_id}.parquet"
Show the code
# Info about personas status from company_id
Show the code
query = f"""
SELECT *
FROM read_parquet('{path_data_experience}')
WHERE company_id = '{current_company_id}'
"""
df_experiences_by_company_id = pl.DataFrame(conn.execute(query).df())
Show the code
personas_whitout_end_date = df_experiences_by_company_id.filter(pl.col('end_date').is_null())
personas_who_got_raise = df_experiences_by_company_id.filter((pl.col('end_date').is_not_null()) &
                                     pl.col('persona_id').is_in(personas_whitout_end_date['persona_id'].to_list()))
personas_who_stayed = (pl
                      .concat([personas_whitout_end_date, personas_who_got_raise])
                      .sort('start_date')
                      .group_by('persona_id')
                      .agg(
                          pl.col('title_name').last(),
                          pl.col('is_primary').last(),
                          pl.col('start_date').min(),
                          pl.col('end_date').max(),
                          pl.col('title_name').count().alias('changes'),
                          pl.col('title_name').unique().alias('all_title_name'),
                      )
                      .with_columns(
                          pl.lit(True).alias('still_associated'),
                          pl.lit(None).alias('end_date')
                      )
                      .sort('changes')
                             )
Show the code
personas_who_left = df_experiences_by_company_id.filter((pl.col('end_date').is_not_null()) & ~pl.col('persona_id').is_in(personas_who_stayed['persona_id'].to_list()) )
personas_who_left = (personas_who_left
                     .sort('start_date')
                     .group_by('persona_id')
                     .agg(
                          pl.col('title_name').last(),
                          pl.col('is_primary').last(),
                          pl.col('start_date').min(),
                          pl.col('end_date').max(),
                          pl.col('title_name').count().alias('changes'),
                          pl.col('title_name').unique().alias('all_title_name'),
                              )
                     .with_columns(
                         pl.lit(False).alias('still_associated'),
                         
                     )
                     .sort('changes'))
Show the code
df_personas_who_worked_in_company = pl.concat([personas_who_stayed, personas_who_left], how='vertical_relaxed').with_columns(
    (pl.col('end_date').dt.year()-pl.col('start_date').dt.year()).alias('work_durration')
).sort('work_durration')
Show the code
import dns.resolver
import smtplib
import socket

def check_deliverability(email_address):
    """
    Checks the deliverability of an email address by verifying MX records
    and performing an SMTP connection test.
    """
    if '@' not in email_address:
        return False
    
    domain = email_address.split('@')[1]
    
    # Check for MX records
    try:
        mx_records = dns.resolver.resolve(domain, 'MX')
        if not mx_records:
            return False
    except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN, dns.resolver.Timeout):
        return False

    # Perform SMTP connection test
    mx_host = str(mx_records[0].exchange)
    
    # Validate MX hostname before attempting connection
    try:
        # Test if hostname can be properly encoded
        mx_host.encode('idna')
    except UnicodeError:
        return False
    
    try:
        with smtplib.SMTP(mx_host, timeout=10) as smtp:
            smtp.set_debuglevel(0)
            smtp.helo(socket.gethostname())
            smtp.mail('test@example.com')
            code, _ = smtp.rcpt(email_address)

            return code == 250  # 250 indicates valid email address
            
    except (smtplib.SMTPException, socket.error, UnicodeError):
        return False
Show the code
# info of all personas info
list_w = []
for word in df_experiences_by_company_id['persona_id'].unique().to_list():
    list_w.append(f"'{word}'")
list_for_in = ', '.join(list_w)

query = f"""
SELECT *
FROM read_parquet('{path_data_persona}')
WHERE persona_id IN ({list_for_in})
"""
df_all_personas = pl.DataFrame(conn.execute(query).df())

# info of all personas profiles
list_w = []
for word in df_experiences_by_company_id['persona_id'].unique().to_list():
    list_w.append(f"'{word}'")
list_for_in = ', '.join(list_w)

query = f"""
SELECT *
FROM read_parquet('{path_data_profiles}')
WHERE persona_id IN ({list_for_in})
"""
df_all_personas_profile = pl.DataFrame(conn.execute(query).df())
df_all_personas_profile_f = df_all_personas_profile.group_by('persona_id').agg(pl.col('url').unique())

# info of all personas email
list_w = []
for word in df_experiences_by_company_id['persona_id'].unique().to_list():
    list_w.append(f"'{word}'")
list_for_in = ', '.join(list_w)

query = f"""
SELECT *
FROM read_parquet('{path_data_emails}')
WHERE persona_id IN ({list_for_in}) AND type == 'personal'
"""
df_all_personas_emails = pl.DataFrame(conn.execute(query).df())

def def_polars_fix_gmail(x):
    if "@gmail" in x:
        first_part = x.split('@')[0]
        second_part = x.split('@')[1]
        return f"{first_part.replace(".",'')}@{second_part}"
    else:
        return x

df_all_personas_emails_f = (df_all_personas_emails
                            .with_columns(pl.col('address')
                                          .map_elements(def_polars_fix_gmail, return_dtype=pl.String)
                                          .alias('normalised_emails'))
                            .unique('normalised_emails', keep='first')
                            .sort('persona_id')
                            .drop('normalised_emails')
                         )
df_all_personas_emails_f = (df_all_personas_emails_f.group_by('persona_id').agg(pl.col('address').unique(),pl.col('type').unique()))
df_all_personas_plus = df_all_personas.join(df_all_personas_emails_f, on='persona_id', how='left')

df_full_personas_who_worked_in_company = (df_personas_who_worked_in_company
                                       .join(df_all_personas_plus, on='persona_id', how='left')
                                       .join(df_all_personas_profile_f, on='persona_id', how='left')
                                      )

df_full_personas_who_worked_in_company = (
    df_full_personas_who_worked_in_company.with_columns(
        (pl.col("start_date").fill_null(pl.col("start_date").min()))
        .dt.year()
        .alias("start_year"),
        (pl.col("end_date").dt.year()).alias("end_year"),
    )
)

work_years = []
for i in range(len(df_full_personas_who_worked_in_company)):
    start_y = df_full_personas_who_worked_in_company[i, "start_year"]
    if df_full_personas_who_worked_in_company[i, "end_year"]:
        end_y = df_full_personas_who_worked_in_company[i, "end_year"]
    else:
        end_y = 2020

    tmp_work_years = []
    for y in range(start_y, end_y + 1):
        tmp_work_years.append(y)

    work_years.append(tmp_work_years)

df_full_personas_who_worked_in_company = (
    df_full_personas_who_worked_in_company.with_columns(
        pl.Series("work_years", work_years)
    )
)

# add hireups
title_name_match = ["ceo","chief","founder","owner","president","vp","vice","director",
    "cfo","cto","partner","head of","hr ","human","talent","senior","manager","lead"]

df_full_personas_who_worked_in_company = (df_full_personas_who_worked_in_company
    .with_columns(
        pl.when(pl.col('title_name').str.contains_any(title_name_match)).then(True).otherwise(False).alias("higher_up")
    ))



df_tmp_email_checker = (
    df_full_personas_who_worked_in_company
    .filter(
            pl.col('still_associated')==True,
            pl.col('address').list.len()>0
    )
        ['persona_id','address']
        .explode('address')
)

# if current_company_parquet.exists():
#     df_pre_full_personas_who_worked_in_company = pl.read_parquet(current_company_parquet)
#     list_pre_deliverable_address = df_pre_full_personas_who_worked_in_company['address'].drop_nulls().explode().to_list()
# else:
#     list_pre_deliverable_address = []

# list_of_emails_to_check = df_tmp_email_checker['address'].drop_nulls().to_list()
# list_lists_email_check = []

# var_total_emails = len(list_of_emails_to_check)
# var_current_email_count = 0

# def def_check_and_populate(email_to_check):
#     global list_lists_email_check, var_current_email_count
#     if email_to_check in list_pre_deliverable_address:
#         list_lists_email_check.append([email_to_check, True])
#     elif '@gmail' in email_to_check:
#         list_lists_email_check.append([email_to_check, True])
#     else:
#         try:
#             is_deliverable = check_deliverability(email_to_check)
#             list_lists_email_check.append([email_to_check, is_deliverable])
#         except:
#             list_lists_email_check.append([email_to_check, False])
#     var_current_email_count += 1
#     print(' '*10, end='\r')
#     print(round(var_current_email_count/var_total_emails,5), end='\r')

# with ThreadPoolExecutor(max_workers=20) as executor:
#     results = list(executor.map(def_check_and_populate, list_of_emails_to_check))

# df_email_check = pl.DataFrame(list_lists_email_check, schema=["address", "deliverable"], orient="row")
# try:
#     df_tmp_email_checker_f = (
#         df_tmp_email_checker
#             .join(df_email_check, on='address')
#             .filter(pl.col('deliverable')==True)
#             .group_by('persona_id').agg(pl.col('address').unique().alias("deliverable_address"))
#     )
# except:
#     df_tmp_email_checker_f = pl.DataFrame()

# if df_tmp_email_checker_f.is_empty():
#     df_full_personas_who_worked_in_company = df_full_personas_who_worked_in_company.join(df_tmp_email_checker.rename({'address':'deliverable_address'}), on="persona_id", how='left')
# else:
#     df_full_personas_who_worked_in_company = df_full_personas_who_worked_in_company.join(df_tmp_email_checker_f, on="persona_id", how='left')
Show the code
# Info about personas experiences
Show the code
# info of all experiences[]
list_w = []
for word in df_experiences_by_company_id['persona_id'].unique().to_list():
    list_w.append(f"'{word}'")

list_for_in = ', '.join(list_w)
query = f"""
SELECT *
FROM read_parquet('{path_data_experience}')
WHERE persona_id IN ({list_for_in})
"""
df_all_personas_experiences = pl.DataFrame(conn.execute(query).df())


# info of all comapnies in said experiences
list_w = []
for word in df_all_personas_experiences['company_id'].unique().to_list():
    if "'" not in word:
        list_w.append(f"'{word}'")

list_for_in = ', '.join(list_w)
query = f"""
SELECT company_id, company_name, company_industry, company_linkedin_url, company_location_country
FROM read_parquet('{path_data_companies}')
WHERE company_id IN ({list_for_in})
"""
df_all_companies = pl.DataFrame(conn.execute(query).df())

df_full_personas_experiences_plus = df_all_personas_experiences.join(df_all_companies, on='company_id', how='left')

df_full_personas_experiences_plus = (
    df_full_personas_experiences_plus
    .with_columns(
        pl.when(
            pl.col('company_id')==current_company_id
        )
        .then(True)
        .otherwise(False)
        .alias('target')
    )
)
Show the code
# Info about personas education
Show the code
# info of all experiences
list_w = []
for word in df_experiences_by_company_id['persona_id'].unique().to_list():
    list_w.append(f"'{word}'")

list_for_in = ', '.join(list_w)
query = f"""
SELECT *
FROM read_parquet('{path_data_education}')
WHERE persona_id IN ({list_for_in})
"""
df_all_personas_education = pl.DataFrame(conn.execute(query).df())


#ifon of allcomapnies in said experiences
list_w = []
for word in df_all_personas_education['school_id'].unique().to_list():
    if "'" not in word:
        list_w.append(f"'{word}'")

if list_w:
    list_for_in = ', '.join(list_w)
    query = f"""
    SELECT school_id, school_name, school_type, school_website, school_location_country
    FROM read_parquet('{path_data_school}')
    WHERE school_id IN ({list_for_in})
    """
    df_all_school = pl.DataFrame(conn.execute(query).df())
    
    df_full_personas_education_plus = df_all_personas_education.join(df_all_school, on='school_id', how='left')
else:
    df_full_personas_education_plus = df_all_personas_education

1 About the project

The project came to life after realizing that web scraping doesn’t allow deep-level filtering—without consuming too much time.The irony is, this project itself took me about a month, but the final RDB contains more data than I could ever scrape.

The raw data was 1.4 TB in size and holds information previously scraped.
Processing was done on my local machine using Python, Polars, and DuckDB, following this workflow:
- Processed raw data into structured Parquet files using Polars.
- Transformed each Parquet file into mini RDBs using Polars.
- Merged all mini RDBs into one using DuckDB.
- Analyzed and filtered data to fit the current project.

Alt text Alt text Alt text Alt text

2 EDA

2.1 information technology and services indestry’s yearly new recruit count

Show the code
list_of_unique_company_experience_years = []
for y in df_full_personas_who_worked_in_company['start_year'].unique().drop_nulls().to_list():
    if y not in list_of_unique_company_experience_years:
        list_of_unique_company_experience_years.append(y)
for y in df_full_personas_who_worked_in_company['end_year'].unique().drop_nulls().to_list():
    if y not in list_of_unique_company_experience_years:
        list_of_unique_company_experience_years.append(y)

list_year = []
list_state = []
list_count = []
list_names = []

def def_get_names_breked(tmp):
    if tmp.is_empty():
        names_string = ''
    else:
        tmp_list_name = []
        names_limit = 3
        row_limit = names_limit * 6
        for i, name in enumerate(tmp['full_name'].to_list()):
            ii = i+1
            tmp_list_name.append(name.title())
            if ii!=0 and ii%names_limit==0:
                tmp_list_name.append("<br>")
            if ii==row_limit:
                tmp_list_name.append("...")
                break
        names_string = ', '.join(tmp_list_name).replace(", <br>, ","<br>")
    return names_string

for y in list_of_unique_company_experience_years:
    #recuite state
    list_year.append(y)
    list_state.append('Recruited')
    tmp = df_full_personas_who_worked_in_company.filter(pl.col('start_year')==y).sort('full_name')
    list_count.append(len(tmp))
    list_names.append(def_get_names_breked(tmp))
    
    #recuite state
    list_year.append(y)
    list_state.append('Resigned')
    tmp = df_full_personas_who_worked_in_company.filter(pl.col('end_year')==y).sort('full_name')
    list_count.append(len(tmp))
    list_names.append(def_get_names_breked(tmp))

df_m_recruite_vs_resign = pl.DataFrame({
    'year':list_year,
    'status':list_state,
    'count':list_count,
    'names':list_names,})

2.2 nearform’s workforce status over the years

3 Persona company network graph

Show the code
gr_net = df_full_personas_experiences_plus.with_columns(pl.col('company_id').str.to_uppercase()).group_by('persona_id','company_id').agg(pl.len().alias('count')).sort('count')
list_top_in_network = gr_net['company_id'].value_counts().sort('count', descending=True)['company_id'].to_list()[:5]
gr_net_f = gr_net.filter(pl.col('company_id').is_in(list_top_in_network))

list_letters = ['A','B','C','D','E','F','G','H']
dict_company = {}
dict_company_rev = {}
for company, letter in zip(list_top_in_network, list_letters ):
    dict_company[letter] = company
    dict_company_rev[company] = letter

gr_gr_net_f = gr_net_f.sort('company_id').group_by('persona_id').agg(pl.col('company_id').unique().sort(),)

gr_gr_net_f2 = (
    gr_gr_net_f['company_id']
    .value_counts()
    .with_columns(
        # pl.col('company_id').list.join(', '),
        (pl.col('count')/len(gr_gr_net_f)).alias('per')
    )
    .sort('per',descending=True)
)

list_prob = []
for i in range(len(gr_gr_net_f2)):
    tmp_prob_letters = []
    for k in dict_company.keys():
        if dict_company[k] in gr_gr_net_f2[i]['company_id'][0].to_list():
            tmp_prob_letters.append(f' {k}')
        else:
            tmp_prob_letters.append(f'¬{k}')

    list_prob.append(f"P({' ∩ '.join(tmp_prob_letters)}) = {round(gr_gr_net_f2[i]['per'][0],4)}")
annon_prob_text = "<b>Probability Distribution:</b><br>" + '<br>'.join(list_prob)



# Create network graph
G = nx.Graph()
for persona, company in gr_net_f.select(['persona_id', 'company_id']).iter_rows():
    G.add_edge(persona, company)

# Get unique values
persona_ids = gr_net_f['persona_id'].unique().to_list()
company_ids = gr_net_f['company_id'].unique().to_list()

# Calculate degrees (connection counts)
degree_dict = dict(G.degree())

# Get min and max degrees for scaling
company_degrees = [degree_dict[c] for c in company_ids]
persona_degrees = [degree_dict[p] for p in persona_ids]

min_company_degree = min(company_degrees) if company_degrees else 1
max_company_degree = max(company_degrees) if company_degrees else 1
min_persona_degree = min(persona_degrees) if persona_degrees else 1
max_persona_degree = max(persona_degrees) if persona_degrees else 1

# Define size ranges
COMPANY_MIN_SIZE = 25
COMPANY_MAX_SIZE = 100
PERSONA_MIN_SIZE = 5
PERSONA_MAX_SIZE = 20

# print(f"Company connections range: {min_company_degree} - {max_company_degree}")
# print(f"Persona connections range: {min_persona_degree} - {max_persona_degree}")

# Sort companies by degree (size) in descending order
company_ids_sorted = sorted(company_ids, key=lambda x: degree_dict[x], reverse=True)

# Check if "Nokia" exists in the data
HIGHLIGHTED_COMPANY = current_company_id
HIGHLIGHTED_COMPANY_EXISTS = HIGHLIGHTED_COMPANY.lower() in [str(c).lower() for c in company_ids]

if HIGHLIGHTED_COMPANY_EXISTS:
    # Get the actual case-sensitive name
    highlighted_company = next(c for c in company_ids if str(c).lower() == HIGHLIGHTED_COMPANY.lower())
    # print(f"Highlighting company: {highlighted_company} (with {degree_dict[highlighted_company]} connections)")
else:
    # print(f"Warning: '{HIGHLIGHTED_COMPANY}' not found in company list")
    highlighted_company = None

# Create layout (companies on outer circle, ordered by size)
pos = {}
num_companies = len(company_ids_sorted)
radius_outer = 2.0

# Position companies on circle, ordered by size (largest first)
for i, company in enumerate(company_ids_sorted):
    # Start at top (90° or π/2 radians) and go counter-clockwise (add angle)
    # Counter-clockwise rotation: angle = start_angle + (i * 2π / num_companies)
    # This puts largest at top, next on left, then bottom, then right
    start_angle = np.pi / 2  # 90° at top
    
    # For counter-clockwise rotation
    angle = start_angle - (2 * np.pi * i / num_companies)
    
    # Convert to x, y coordinates
    pos[company] = (radius_outer * np.cos(angle), radius_outer * np.sin(angle))

# Position personas
for i, persona in enumerate(persona_ids):
    connected_companies = [c for c in company_ids if G.has_edge(persona, c)]
    if connected_companies:
        avg_x = np.mean([pos[c][0] for c in connected_companies])
        avg_y = np.mean([pos[c][1] for c in connected_companies])
        # Add jitter to spread out personas
        jitter_x = np.random.uniform(-0.2, 0.2)
        jitter_y = np.random.uniform(-0.2, 0.2)
        pos[persona] = (avg_x * 0.5 + jitter_x, avg_y * 0.5 + jitter_y)
    else:
        pos[persona] = (0, 0)

# Prepare edge traces
edge_x, edge_y = [], []
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.6, color='rgba(120, 120, 120, 0.15)'),
    hoverinfo='none',
    mode='lines')

# Prepare node traces with proportional sizing
company_x, company_y, company_text = [], [], []
company_color, company_size, company_hover = [], [], []
company_border_width = []  # For border thickness
company_border_color = []  # For border color

persona_x, persona_y = [], []
persona_color, persona_size, persona_hover = [], [], []

# Helper function to scale size proportionally
def scale_size(value, min_val, max_val, min_size, max_size):
    if max_val == min_val:
        return (min_size + max_size) / 2
    return min_size + (value - min_val) / (max_val - min_val) * (max_size - min_size)

# Add COMPANY nodes in sorted order (largest first)
for company in company_ids_sorted:
    x, y = pos[company]
    company_x.append(x)
    company_y.append(y)
    company_text.append(str(company))
    company_color.append('#EF553B')
    
    connections = degree_dict[company]
    # Scale size based on connection count
    scaled_size = scale_size(
        connections, 
        min_company_degree, 
        max_company_degree,
        COMPANY_MIN_SIZE, 
        COMPANY_MAX_SIZE
    )
    company_size.append(scaled_size)
    
    # Custom border for highlighted company
    if highlighted_company and company == highlighted_company:
        company_border_width.append(4)  # Thicker border
        company_border_color.append('#000000')  # Black border
    else:
        company_border_width.append(1)
        company_border_color.append('#000000')
    
    # Hover text
    personas = gr_net_f.filter(pl.col('company_id') == company)['persona_id'].to_list()
    rank = company_ids_sorted.index(company) + 1
    hover_text = f"<b>Company #{rank}:</b> {company}<br>"
    hover_text += f"<b>Personas worked here:</b> {connections}<br>"
    hover_text += f"<b>Connection rank:</b> {rank}/{len(company_ids_sorted)}<br>"
    if connections > 0:
        for persona in personas[:5]:
            persona_name = df_all_personas.filter(pl.col('persona_id')==persona)['full_name'][0].title()
            hover_text += f" • {persona_name}<br>"
        if connections > 5:
            hover_text += f" • ... and {connections - 5} more"
    company_hover.append(hover_text)

# Add PERSONA nodes
for persona in persona_ids:
    x, y = pos[persona]
    persona_x.append(x)
    persona_y.append(y)
    persona_color.append('#636efa')
    
    connections = degree_dict[persona]
    # Scale size based on connection count
    scaled_size = scale_size(
        connections,
        min_persona_degree,
        max_persona_degree,
        PERSONA_MIN_SIZE,
        PERSONA_MAX_SIZE
    )
    persona_size.append(scaled_size)
    
    # Hover text
    companies = gr_net_f.filter(pl.col('persona_id') == persona)['company_id'].to_list()
    persona_name = df_all_personas.filter(pl.col('persona_id')==persona)['full_name'][0].title()
    hover_text = f"<b>Persona:</b> {persona_name}<br>"
    hover_text += f"<b>Companies worked at:</b> {connections}<br>"
    if connections > 0:
        # Check if worked at highlighted company
        if highlighted_company:
            worked_at_highlighted = highlighted_company in companies
            if worked_at_highlighted:
                hover_text += f"<b>Worked at {highlighted_company}:</b> ✓<br>"
        
        hover_text += "<br>".join([f"  • {comp}" for comp in companies[:5]])
        if connections > 5:
            hover_text += f"<br>  • ... and {connections - 5} more"
    persona_hover.append(hover_text)

# Create company node trace
company_trace = go.Scatter(
    x=company_x, y=company_y,
    mode='markers+text',
    hoverinfo='text',
    hovertext=company_hover,
    text=company_text,
    textposition="top center",
    textfont=dict(size=14, color='black'),
    marker=dict(
        color=company_color,
        size=company_size,
        line=dict(
            width=company_border_width,
            color=company_border_color
        ),
        opacity=0.9)
)

# Create persona node trace
persona_trace = go.Scatter(
    x=persona_x, y=persona_y,
    mode='markers',
    hoverinfo='text',
    hovertext=persona_hover,
    text=None,  # No text for personas
    marker=dict(
        color=persona_color,
        size=persona_size,
        line=dict(width=1, color='black'),
        opacity=0.7)
)

# Calculate axis ranges for 1:1 aspect ratio
all_positions = list(pos.values())
x_vals = [p[0] for p in all_positions]
y_vals = [p[1] for p in all_positions]

# Add padding
x_range = [min(x_vals) - 0.5, max(x_vals) + 0.5]
y_range = [min(y_vals) - 0.5, max(y_vals) + 0.5]

# Make axes have the same range for 1:1 aspect
max_range = max(x_range[1] - x_range[0], y_range[1] - y_range[0])
x_center = (x_range[0] + x_range[1]) / 2
y_center = (y_range[0] + y_range[1]) / 2

x_range = [x_center - max_range/2, x_center + max_range/2]
y_range = [y_center - max_range/2, y_center + max_range/2]

# Create figure with 1:1 aspect ratio
fig = go.Figure(data=[edge_trace, persona_trace, company_trace],
                layout=go.Layout(
                    title=f'Persona-Company Network (Companies Ordered by Size)<br><sup>Highlighted: {highlighted_company if highlighted_company else "None"}</sup>',
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20, l=20, r=20, t=100),
                    xaxis=dict(
                        showgrid=False, 
                        zeroline=False, 
                        showticklabels=False,
                        range=x_range,
                        scaleanchor="y",
                        scaleratio=1
                    ),
                    yaxis=dict(
                        showgrid=False, 
                        zeroline=False, 
                        showticklabels=False,
                        range=y_range
                    ),
                    plot_bgcolor='white',
                    paper_bgcolor='white',
                    width=900,
                    height=900
                ))

# Add legend with size examples and highlighting info
# legend_text = f"""
# <b>Node Size = Connection Count</b><br>
# <span style='color:#EF553B'>● Companies</span><br>
# <span style='color:#636efa'>● Personas</span> (hover for details)
# """

# fig.add_annotation(
#     x=0.98, y=0.98,
#     xref="paper", yref="paper",
#     text=legend_text,
#     showarrow=False,
#     font=dict(size=14),
#     align="left",
#     bgcolor="rgba(255, 255, 255, 0.95)",
    
# )

# Add top companies list
top_companies = company_ids_sorted[:10]  # Top 10 companies
top_companies_text = "<b>Top Companies by Connections:</b><br>"
for i, company in enumerate(top_companies, 1):
    connections = degree_dict[company]
    top_connections = degree_dict[top_companies[0]]
    connections_per = f" | {round(connections/top_connections*100)}%" if highlighted_company and company != highlighted_company else ""
    highlight_indicator = " " if highlighted_company and company == highlighted_company else ""
    top_companies_text += f"{dict_company_rev[company]}. {company}: {connections} {connections_per} {highlight_indicator}<br>"

fig.add_annotation(
    x=0.02, y=0.98,
    xref="paper", yref="paper",
    text=top_companies_text,
    showarrow=False,
    font=dict(size=14),
    align="left",
    bgcolor="rgba(255, 255, 255, 0.9)",
    # bordercolor="#666",
    # borderwidth=1
)

# Add probabiliy list

fig.add_annotation(
    x=0.98, y=0.98,
    xref="paper", yref="paper",
    text=annon_prob_text,
    showarrow=False,
    font=dict(
        family="'Courier New', monospace",  # Multiple fallbacks
        size=12,
        color="black"
    ),
    align="left",
    bgcolor="rgba(255, 255, 255, 0.95)",
    
)
fig.write_image((path_output_images/f'network_{current_company_id}.webp'))
fig.show()
Show the code
amount = 5

tmp = df_full_personas_who_worked_in_company.sort(
    ["inferred_salary", "linkedin_connections", "inferred_years_experience"],
    descending=True,
)
tmp_gr = df_full_personas_experiences_plus.group_by('persona_id').agg(pl.len().alias('experience_count'))
tmp = df_full_personas_who_worked_in_company.join(tmp_gr, on='persona_id').sort('experience_count',descending=True)

tmp2 = pl.concat(
    [tmp.filter(pl.col('still_associated')==True, pl.col('higher_up')==True)[:amount*2],
     tmp.filter(pl.col('still_associated')==True, pl.col('higher_up')==False)[:amount*2],
     tmp.filter(pl.col('still_associated')==False, pl.col('higher_up')==True)[:amount*1],
     tmp.filter(pl.col('still_associated')==False, pl.col('higher_up')==False)[:amount*1],
     tmp.filter(pl.col('title_name').str.contains_any(['founder','ceo','presi','owner']))
    ]
).sort("full_name")

list_persona_for_plot = tmp2['persona_id'].to_list()
Show the code
# Workforce data
Show the code
def def_plotly_experience_range(current_persona_id):
    tmp_df = (df_full_personas_who_worked_in_company
              .filter(pl.col('persona_id')==current_persona_id)
              .with_columns(pl.col('end_year').fill_null(2021))['start_year','end_year'])
    
    fig_tmp = copy.deepcopy(fig_company_hiring_trend)
    fig_tmp.add_vrect(
        x0=tmp_df[0,'start_year'],
        x1=tmp_df[0,'end_year'],
        fillcolor="blue",
        opacity=0.1,
        line_width=0 
    )
    return fig_tmp

def def_plotly_experience_gantt(current_persona_id):
    px_data = (df_full_personas_experiences_plus
               .filter(pl.col('persona_id')==current_persona_id)
               .with_columns(
                   pl.col('end_date').fill_null(datetime.datetime(2020, 1, 1, 0,0)),
                   pl.col('company_name').str.to_uppercase(),
                   # pl.col('company_name').str.to_uppercase().str.replace_all('&', '-and-')
               )
               .sort('start_date'))
    
    y_order = px_data['company_name'].to_list()
    
    fig = px.timeline(px_data,x_start="start_date", x_end="end_date", y="company_name",
                      color='target',hover_data=["title_name"], height=140+30*len(px_data),
                      category_orders={"company_name": y_order},
                      color_discrete_map={True:'#EF553B',  False:'#636efa'},
                      labels={'target':'Target', 'start_date':'Recruited', 'end_date':'If-Resigned', 
                             'company_name':'Company', 'title_name':'Job role'}
                     # title=f"Experience of {current_persona_name}.",
                     )
    fig.update_yaxes(
        # autorange="reversed",
                              showgrid=True,
                              gridcolor='lightgray',
                              gridwidth=1,
                              griddash='dot'
    )
    fig.update_layout(showlegend=False, xaxis_title=None, yaxis_title=None)
    return fig

4 Workforce sample

4.1 Adam Davis

Job title: Developer
Socials: https://meetup.com/members/13688401 | https://linkedin.com/in/adam-davis-91b3937 | https://linkedin.com/in/admataz | https://flickr.com/people/admataz | https://twitter.com/admataz | https://dribbble.com/admataz | https://github.com/admataz | https://stackoverflow.com/users/296390

4.1.1 Adam Davis’s working period at nearform

4.1.2 Gantt plot of Adam Davis’s experience


4.2 Adrian Rossouw

Job title: Delivery architect
Socials: https://linkedin.com/in/adrian-rossouw-a7013 | https://facebook.com/adrian.rossouw | https://github.com/adrianrossouw | https://linkedin.com/in/adrianrossouw | https://twitter.com/adrianrossouw | https://gravatar.com/nodecpt | https://flickr.com/people/vertice

4.2.1 Adrian Rossouw’s working period at nearform

4.2.2 Gantt plot of Adrian Rossouw’s experience


4.3 Adrien Becchis

Job title: Ingã nieur logiciel open source
Socials: https://linkedin.com/in/adrien-becchis-1471b668 | https://linkedin.com/in/adrienbecchis

4.3.1 Adrien Becchis’s working period at nearform

4.3.2 Gantt plot of Adrien Becchis’s experience


4.4 Alessio Carnevale

Job title: Full stack javascript developer
Socials: https://linkedin.com/in/carnevale

4.4.1 Alessio Carnevale’s working period at nearform

4.4.2 Gantt plot of Alessio Carnevale’s experience


4.5 Alex Knol

Job title: Director of devops
Socials: https://about.me/alexknol | https://facebook.com/alexknol | https://youtube.com/user/alexknol37 | https://twitter.com/e_lexy | https://klout.com/e_lexy | https://linkedin.com/in/elexy | https://github.com/elexy | https://gravatar.com/seretalabs

4.5.1 Alex Knol’s working period at nearform

4.5.2 Gantt plot of Alex Knol’s experience


4.6 Anton Whalley

Job title: Vice president of engineering
Socials: https://linkedin.com/in/anton-whalley-a57a674 | https://linkedin.com/in/antonwhalley | https://gravatar.com/mrcowsy | https://github.com/no9

4.6.1 Anton Whalley’s working period at nearform

4.6.2 Gantt plot of Anton Whalley’s experience


4.7 Aoife O’Carroll

Job title: Content writer
Socials: https://facebook.com/aoife.ocarroll.16 | https://twitter.com/aoifecontent | https://linkedin.com/in/aoifeocarroll

4.7.1 Aoife O’Carroll’s working period at nearform

4.7.2 Gantt plot of Aoife O’Carroll’s experience


4.8 Ariel Flesler

Job title: Senior full stack engineer
Socials: https://angel.co/ariel-flesler | https://linkedin.com/in/ariel-flesler-150a8a5 | https://linkedin.com/in/arielflesler | https://twitter.com/flesler | https://github.com/flesler | https://youtube.com/user/flesler | https://stackoverflow.com/users/680468

4.8.1 Ariel Flesler’s working period at nearform

4.8.2 Gantt plot of Ariel Flesler’s experience


4.9 Brian Delahunty

Job title: Lead software architect
Socials: https://foursquare.com/user/24322248 | https://meetup.com/members/39922482 | https://angel.co/brian-delahunty | https://linkedin.com/in/brian-delahunty-8923366 | https://instagram.com/brian_dela | https://github.com/briandela | https://klout.com/briandela | https://flickr.com/people/briandela | https://linkedin.com/in/briandela | https://gravatar.com/briandela | https://twitter.com/briandela | https://about.me/briandela

4.9.1 Brian Delahunty’s working period at nearform

4.9.2 Gantt plot of Brian Delahunty’s experience


4.10 Chico Charlesworth

Job title: Senior nodejs developer
Socials: https://plus.google.com/+08336919638271737618 | https://foursquare.com/user/84099 | https://gravatar.com/ccharlesworth | https://twitter.com/ccharlesworth | https://klout.com/ccharlesworth | https://vimeo.com/ccharlesworth | https://angel.co/ccharlesworth | https://github.com/chico | https://linkedin.com/in/chico-charlesworth-3720055 | https://about.me/chico.charlesworth | https://pinterest.com/chicoc | https://chicocharlesworth.wordpress.com | https://linkedin.com/in/chicocharlesworth | https://facebook.com/chicocharlesworth | https://youtube.com/user/sexiko | https://flickr.com/people/sexiko | https://stackoverflow.com/users/610106

4.10.1 Chico Charlesworth’s working period at nearform

4.10.2 Gantt plot of Chico Charlesworth’s experience


4.11 Colm Harte

Job title: Technical director
Socials: https://linkedin.com/in/colm-harte-b904597 | https://linkedin.com/in/colmharte

4.11.1 Colm Harte’s working period at nearform

4.11.2 Gantt plot of Colm Harte’s experience


4.12 Danese Cooper

Job title: Vice president special initiatives
Socials: https://foursquare.com/user/11716 | https://about.me/danese | https://youtube.com/user/danese | https://linkedin.com/in/danese-cooper-1139 | https://linkedin.com/in/danesecooper | https://facebook.com/danesecooper | https://pinterest.com/divadanese | https://gravatar.com/divadanese | https://klout.com/divadanese | https://twitter.com/divadanese | https://myspace.com/zelody_pictures

4.12.1 Danese Cooper’s working period at nearform

4.12.2 Gantt plot of Danese Cooper’s experience


4.13 Danese Cooper

Job title: Vice president special initiatives
Socials: https://foursquare.com/user/11716 | https://about.me/danese | https://youtube.com/user/danese | https://linkedin.com/in/danese-cooper-1139 | https://linkedin.com/in/danesecooper | https://facebook.com/danesecooper | https://pinterest.com/divadanese | https://gravatar.com/divadanese | https://klout.com/divadanese | https://twitter.com/divadanese | https://myspace.com/zelody_pictures

4.13.1 Danese Cooper’s working period at nearform

4.13.2 Gantt plot of Danese Cooper’s experience


4.14 Denise Dresbach

Job title: Quality assurance engineer
Socials: https://indeed.com/r/denise-dresbach | https://linkedin.com/in/denise-dresbach-8105963 | https://facebook.com/denise.dresbach | https://github.com/denise1212 | https://linkedin.com/in/denisedresbach | https://pinterest.com/denisedresbach

4.14.1 Denise Dresbach’s working period at nearform

4.14.2 Gantt plot of Denise Dresbach’s experience


4.15 Derek Henderson

Job title: Frontend web developer
Socials: https://linkedin.com/in/derek-henderson

4.15.1 Derek Henderson’s working period at nearform

4.15.2 Gantt plot of Derek Henderson’s experience


4.16 Fiona Kenna

Job title: Chief financial officer
Socials: https://linkedin.com/in/fiona-mc-kenna-174172a2 | https://linkedin.com/in/fiona-mc-kenna-fcca-174172a2

4.16.1 Fiona Kenna’s working period at nearform

4.16.2 Gantt plot of Fiona Kenna’s experience


4.17 Ger O’Shaughnessy

Job title: Head of propositions
Socials: https://twitter.com/geroshaugh | https://linkedin.com/in/geroshaugh

4.17.1 Ger O’Shaughnessy’s working period at nearform

4.17.2 Gantt plot of Ger O’Shaughnessy’s experience


4.18 Guy Ellis

Job title: Node.js and javascript solutions architect
Socials: https://plus.google.com/+guyellis | https://meetup.com/members/12091802 | https://foursquare.com/user/2663334 | https://vimeo.com/user/5691315 | https://quora.com/guy-ellis | https://linkedin.com/in/guy-ellis-17a4291 | https://github.com/guyellis | https://facebook.com/wildfiction | https://youtube.com/user/wildfiction | https://about.me/wildfiction | https://gravatar.com/wildfiction | https://twitter.com/wildfiction | https://klout.com/wildfiction | https://flickr.com/people/wildfiction | https://pinterest.com/wildfiction | https://linkedin.com/in/wildfiction | https://stackoverflow.com/users/1463

4.18.1 Guy Ellis’s working period at nearform

4.18.2 Gantt plot of Guy Ellis’s experience


4.19 Helene Haughney

Job title: Chief operations officer
Socials: https://linkedin.com/in/helenehaughney | https://flickr.com/people/helenehaughney | https://twitter.com/hhaughney

4.19.1 Helene Haughney’s working period at nearform

4.19.2 Gantt plot of Helene Haughney’s experience


4.20 Hugh Mooney

Job title: Devops engineer
Socials: https://linkedin.com/in/hugh-mooney-69b4625 | https://linkedin.com/in/kxseven

4.20.1 Hugh Mooney’s working period at nearform

4.20.2 Gantt plot of Hugh Mooney’s experience


4.21 Jan-Jan Van Der Vyver

Job title: Software developer
Socials: https://linkedin.com/in/dr-jan-jan-van-der-vyver-b92a693 | https://twitter.com/iibtrih | https://angel.co/jan-jan-van-der-vyver | https://linkedin.com/in/janjanvandervyver | https://facebook.com/100005025288171

4.21.1 Jan-Jan Van Der Vyver’s working period at nearform

4.21.2 Gantt plot of Jan-Jan Van Der Vyver’s experience


4.22 Jason Melo

Job title: Vice president of technology and strategic initiatives advisor
Socials: https://vimeo.com/user/355412 | https://linkedin.com/in/jason-melo-1646b41 | https://facebook.com/jason.melo | https://github.com/jasonmelo | https://angel.co/jasonmelo | https://linkedin.com/in/jasonmelo | https://twitter.com/jasonmelo | https://gravatar.com/jasonmeloadp | https://youtube.com/user/lucaandetan | https://stackoverflow.com/users/1886312

4.22.1 Jason Melo’s working period at nearform

4.22.2 Gantt plot of Jason Melo’s experience


4.23 Jeff Simons

Job title: Principal designer - middle east
Socials: https://meetup.com/members/23957672 | https://linkedin.com/in/jeff-simons-a743287 | https://dribbble.com/jeffsimons | https://github.com/jeffsimons | https://twitter.com/jeffsimons | https://linkedin.com/in/jeffsimons | https://flickr.com/people/jeffsimons | https://facebook.com/jffsimons

4.23.1 Jeff Simons’s working period at nearform

4.23.2 Gantt plot of Jeff Simons’s experience


4.24 John Behan

Job title: Senior software engineer
Socials: https://twitter.com/jjmax75 | https://github.com/jjmax75 | https://linkedin.com/in/johnbehan | https://linkedin.com/in/seanbehan75 | https://stackoverflow.com/users/1377969

4.24.1 John Behan’s working period at nearform

4.24.2 Gantt plot of John Behan’s experience


4.25 Koji Wakayama

Job title: Full stack developer
Socials: https://linkedin.com/in/koji-wakayama-037a317 | https://linkedin.com/in/kojiwakayama

4.25.1 Koji Wakayama’s working period at nearform

4.25.2 Gantt plot of Koji Wakayama’s experience


4.26 Lloyd Benson

Job title: Contractor
Socials: https://linkedin.com/in/lloyd-benson | https://linkedin.com/in/lloyd-benson-b6761b1 | https://github.com/lloydbenson | https://gravatar.com/lloydrbenson

4.26.1 Lloyd Benson’s working period at nearform

4.26.2 Gantt plot of Lloyd Benson’s experience


4.27 Matteo Collina

Job title: Technical director
Socials: https://plus.google.com/+00361393645575064105 | https://foursquare.com/user/1186420 | https://meetup.com/members/12326312 | https://quora.com/matteo-collina | https://linkedin.com/in/matteo-collina-1b123018 | https://pinterest.com/matteocollina | https://linkedin.com/in/matteocollina | https://twitter.com/matteocollina | https://angel.co/matteocollina | https://klout.com/matteocollina | https://facebook.com/matteocollina | https://gravatar.com/mcollina | https://about.me/mcollina | https://github.com/mcollina | https://youtube.com/user/ucc-tcj0sugz21uivih0qglq | https://stackoverflow.com/users/908571

4.27.1 Matteo Collina’s working period at nearform

4.27.2 Gantt plot of Matteo Collina’s experience


4.28 Nicolas Herment

Job title: Software architect
Socials: https://foursquare.com/user/5629535 | https://vimeo.com/herment | https://gravatar.com/nherment | https://klout.com/nherment | https://twitter.com/nherment | https://github.com/nherment | https://angel.co/nicolas-herment | https://linkedin.com/in/nicolas-herment-96bb6a5 | https://linkedin.com/in/nicolasherment | https://stackoverflow.com/users/994084

4.28.1 Nicolas Herment’s working period at nearform

4.28.2 Gantt plot of Nicolas Herment’s experience


4.29 Paweł Lasek

Job title: Devops engineer
Socials: https://linkedin.com/in/paullasek | https://twitter.com/pawel_lasek | https://klout.com/pawel_lasek | https://youtube.com/user/pawellasek | https://linkedin.com/in/pawellasek | https://gravatar.com/unya | https://unya.wordpress.com | https://github.com/unya | https://facebook.com/unyanyanya | https://stackoverflow.com/users/131380

4.29.1 Paweł Lasek’s working period at nearform

4.29.2 Gantt plot of Paweł Lasek’s experience


4.30 Rodolfo Melo

Job title: Software engineer
Socials: https://linkedin.com/in/rodolfo-melo-789b1b10b

4.30.1 Rodolfo Melo’s working period at nearform

4.30.2 Gantt plot of Rodolfo Melo’s experience


4.31 Sam Artuso

Job title: Senior software developer
Socials: https://linkedin.com/in/samartuso

4.31.1 Sam Artuso’s working period at nearform

4.31.2 Gantt plot of Sam Artuso’s experience


4.32 Scott Mcwhirter

Job title: Senior systems engineer
Socials: https://gravatar.com/cloudtone | https://twitter.com/konobi | https://linkedin.com/in/konobi | https://angel.co/konobi | https://klout.com/konobi | https://github.com/konobi | https://linkedin.com/in/scott-mcwhirter-37885b3

4.32.1 Scott Mcwhirter’s working period at nearform

4.32.2 Gantt plot of Scott Mcwhirter’s experience


4.33 Sergi Mansilla

Job title: Associate technical director
Socials: https://meetup.com/members/11101309 | https://vimeo.com/user/1645190 | https://flickr.com/people/mansana | https://youtube.com/user/mrclash | https://github.com/sergi | https://linkedin.com/in/sergi-mansilla-84b4743 | https://facebook.com/sergimansilla | https://dribbble.com/sergimansilla | https://twitter.com/sergimansilla | https://linkedin.com/in/smansilla

4.33.1 Sergi Mansilla’s working period at nearform

4.33.2 Gantt plot of Sergi Mansilla’s experience


Show the code
df_full_personas_who_worked_in_company.write_parquet(current_company_parquet)